if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
## Bioconductor version '3.18' is out-of-date; the current release version '3.19'
##   is available with R version '4.4'; see https://bioconductor.org/install
BiocManager::install("treeio")
## 'getOption("repos")' replaces Bioconductor standard repositories, see
## 'help("repositories", package = "BiocManager")' for details.
## Replacement repositories:
##     CRAN: http://rspm/default/__linux__/focal/latest
## Bioconductor version 3.18 (BiocManager 1.30.23), R 4.3.3 (2024-02-29)
## Warning: package(s) not installed when version(s) same as or greater than current; use
##   `force = TRUE` to re-install: 'treeio'
## Installation paths not writeable, unable to update packages
##   path: /opt/R/4.3.3/lib/R/library
##   packages:
##     boot, codetools, KernSmooth, lattice, survival
BiocManager::install("ggtreeExtra")
## 'getOption("repos")' replaces Bioconductor standard repositories, see
## 'help("repositories", package = "BiocManager")' for details.
## Replacement repositories:
##     CRAN: http://rspm/default/__linux__/focal/latest
## Bioconductor version 3.18 (BiocManager 1.30.23), R 4.3.3 (2024-02-29)
## Warning: package(s) not installed when version(s) same as or greater than current; use
##   `force = TRUE` to re-install: 'ggtreeExtra'
## Installation paths not writeable, unable to update packages
##   path: /opt/R/4.3.3/lib/R/library
##   packages:
##     boot, codetools, KernSmooth, lattice, survival
# Loading all the libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(knitr)
library(ggtree)
## ggtree v3.10.1 For help: https://yulab-smu.top/treedata-book/
## 
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
## 
## Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam.
## ggtree: an R package for visualization and annotation of phylogenetic
## trees with their covariates and other associated data. Methods in
## Ecology and Evolution. 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
## 
## G Yu. Data Integration, Manipulation and Visualization of Phylogenetic
## Trees (1st ed.). Chapman and Hall/CRC. 2022. ISBN: 9781032233574
## 
## Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods
## for mapping and visualizing associated data on phylogeny using ggtree.
## Molecular Biology and Evolution. 2018, 35(12):3041-3043.
## doi:10.1093/molbev/msy194
## 
## Attaching package: 'ggtree'
## 
## The following object is masked from 'package:tidyr':
## 
##     expand
library(TDbook) #A Companion Package for the Book "Data Integration, Manipulation and Visualization of Phylogenetic Trees" by Guangchuang Yu (2022, ISBN:9781032233574).
library(ggimage)
library(rphylopic)
## You are using rphylopic v.1.4.0. Please remember to credit PhyloPic contributors (hint: `get_attribution()`) and cite rphylopic in your work (hint: `citation("rphylopic")`).
## 
## Attaching package: 'rphylopic'
## 
## The following object is masked from 'package:ggimage':
## 
##     geom_phylopic
library(treeio)
## treeio v1.26.0 For help: https://yulab-smu.top/treedata-book/
## 
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
## 
## LG Wang, TTY Lam, S Xu, Z Dai, L Zhou, T Feng, P Guo, CW Dunn, BR
## Jones, T Bradley, H Zhu, Y Guan, Y Jiang, G Yu. treeio: an R package
## for phylogenetic tree input and output with richly annotated and
## associated data. Molecular Biology and Evolution. 2020, 37(2):599-603.
## doi: 10.1093/molbev/msz240
## 
## Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods
## for mapping and visualizing associated data on phylogeny using ggtree.
## Molecular Biology and Evolution. 2018, 35(12):3041-3043.
## doi:10.1093/molbev/msy194
## 
## Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam.
## ggtree: an R package for visualization and annotation of phylogenetic
## trees with their covariates and other associated data. Methods in
## Ecology and Evolution. 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
library(tidytree)
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
## 
## Guangchuang Yu.  Data Integration, Manipulation and Visualization of
## Phylogenetic Trees (1st edition). Chapman and Hall/CRC. 2022,
## doi:10.1201/9781003279242
## 
## S Xu, Z Dai, P Guo, X Fu, S Liu, L Zhou, W Tang, T Feng, M Chen, L
## Zhan, T Wu, E Hu, Y Jiang, X Bo, G Yu. ggtreeExtra: Compact
## visualization of richly annotated phylogenetic data. Molecular Biology
## and Evolution. 2021, 38(9):4039-4042. doi: 10.1093/molbev/msab166
## 
## Attaching package: 'tidytree'
## 
## The following object is masked from 'package:treeio':
## 
##     getNodeNum
## 
## The following object is masked from 'package:stats':
## 
##     filter
library(ape)
## 
## Attaching package: 'ape'
## 
## The following objects are masked from 'package:tidytree':
## 
##     drop.tip, keep.tip
## 
## The following object is masked from 'package:treeio':
## 
##     drop.tip
## 
## The following object is masked from 'package:ggtree':
## 
##     rotate
## 
## The following object is masked from 'package:dplyr':
## 
##     where
library(TreeTools)
## 
## Attaching package: 'TreeTools'
## 
## The following object is masked from 'package:tidytree':
## 
##     MRCA
## 
## The following object is masked from 'package:treeio':
## 
##     MRCA
## 
## The following object is masked from 'package:ggtree':
## 
##     MRCA
library(phytools)
## Loading required package: maps
## 
## Attaching package: 'maps'
## 
## The following object is masked from 'package:purrr':
## 
##     map
## 
## 
## Attaching package: 'phytools'
## 
## The following object is masked from 'package:TreeTools':
## 
##     as.multiPhylo
## 
## The following object is masked from 'package:treeio':
## 
##     read.newick
library(ggnewscale)
library(ggtreeExtra)
## ggtreeExtra v1.12.0 For help: https://yulab-smu.top/treedata-book/
## 
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
## 
## S Xu, Z Dai, P Guo, X Fu, S Liu, L Zhou, W Tang, T Feng, M Chen, L
## Zhan, T Wu, E Hu, Y Jiang, X Bo, G Yu. ggtreeExtra: Compact
## visualization of richly annotated phylogenetic data. Molecular Biology
## and Evolution. 2021, 38(9):4039-4042. doi: 10.1093/molbev/msab166
library(ggstar)
library(DT)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

1 Abstract

Using what was learned in Evolutionary Genomics we analyzed bacterium diversity and evolutionary lineage at a specific site in Yellowstone National Park. We also looked and diversity, behavior and evolutionary lineage within the specific phylum Gemmatimonadota.

2 Motovating Reasons

This report is partly a demonstration of the skills learned throughout the course. While I am not the most proficient in R, the data generated here can be applied to real world situations. I wanted to understand the diversity of bacteria that live in Yellowstone National Park as well as the populations and make up of those bacteria. I also wanted to learn more about a particular phylum of bacteria gemmatimonadota, specifically about their evolutionary lineage and their diversity within the phylum.

3 Introduction

Lab group members Monzer, Sophie and Zack were assigned Yellowstone, a NEON site location, and subsequently were assigned to Gemmatimonadota, a phylum of bacteria. As mentioned, Yellowstone National Park (YELL) is a terrestrial NEON site, that is located in Bozeman, MT. Yellowstone consists of rolling hills, and is characterized by a pine-dominated forest with sage and grass wetlands. YELL is highly representative of a wildland area in NEON’s Northern Rockies Domain (D12) and the trophic structure and community interactions are probably more representative of those that were widespread in the region before Euro-American influence than any other place in the Domain. Thus, the site offers a rare opportunity to understand interactions among climate, natural disturbance, ecosystem processes, and community structure in integrated terrestrial and aquatic systems that are representative of those of intact wildlands across the Domain. Gemmatimonadota are a phylum of bacteria established in the year 2003 after it was discovered in a sewage treatment system. They are a gram negative bacterium, can grow using both aerobic and anaerobic respiration, and studies show that they make up about 2% of soil bacterial communities. The phylum contains two classes: Gemmatimonadetes and Longimicrobia. As for where Gemmatimonadota are found, they are distributed in many natural habitats, but they seem to prefer drier soils as they are well adapted to live in low moisture environments. They make up about 2% of soil bacterial communities and have been identified as one of the top nine phyla found in soils; yet, there are currently only six cultured isolates. The phylum Gemmatimonadota contains two classes, Gemmatimonadetes and Longimicrobia. This project was done in concordance with the course Biology 476/676 Evolutionary Genomics & Bioinformatics taught by Professor Jeffrey Blanchard. This course introduces life science students to evolutionary genomics, bioinformatics, and data sciences. Through computer-based lab sessions, students gain skills in Unix command line, R, reproducible research, and cloud computing. These labs also cover DNA sequence searches, sequence alignment, variation detection, phylogenetics, comparative genomics, and genome visualization. The purpose of this final project was to apply the skills acquired in the course to a real-life scenario, specifically focusing on studying the assigned phylum Gemmatimonadota at the Yellowstone National Park site.

4 Methods

In this project, we utilized the Posit Cloud and R Markdown to create graphs and tables to summarize the analyzed data. The NEON metagenome data, collected from the designated sources, underwent thorough analysis using R Studio and R Markdown to extract meaningful insights. Furthermore, we employed phylogeny to handle data downloaded from the Joint Genome Institute (JGI), specifically referencing the GOLD Study ID Gs0161344 for this project.

4.1 This code was used to load the data:

NEON_MAGs_Ind <- read_csv("data/NEON/GOLD_Study_ID_Gs0161344_NEON.csv") %>%
  filter(`Genome Name` != "NEON combined assembly") 
## Rows: 1754 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): Bin ID, Genome Name, Bin Quality, Bin Lineage, GTDB-Tk Taxonomy L...
## dbl  (10): IMG Genome ID, Bin Completeness, Bin Contamination, Total Number ...
## date  (1): Date Added
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
NEON_MAGs_Ind_tax <- NEON_MAGs_Ind %>% 
  separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus"), "; ", remove = FALSE) 
## Warning: Expected 6 pieces. Additional pieces discarded in 21 rows [12, 32, 66, 79, 80,
## 88, 96, 102, 104, 240, 334, 386, 657, 790, 846, 931, 943, 983, 1041, 1095,
## ...].
## Warning: Expected 6 pieces. Missing pieces filled with `NA` in 282 rows [6, 7, 42, 49,
## 50, 55, 60, 83, 85, 97, 100, 105, 107, 113, 114, 116, 119, 125, 129, 130, ...].
NEON_MAGs <- read_csv("data/NEON/GOLD_Study_ID_Gs0161344_NEON_edArchaea.csv") %>% 
  # remove columns that are not needed for data analysis
  select(-c(`GOLD Study ID`, `Bin Methods`, `Created By`, `Date Added`)) %>% 
  # create a new column with the Assembly Type
  mutate("Assembly Type" = case_when(`Genome Name` == "NEON combined assembly" ~ `Genome Name`,
                            TRUE ~ "Individual")) %>% 
  mutate_at("Assembly Type", str_replace, "NEON combined assembly", "Combined") %>% 
  separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus"), "; ", remove = FALSE) %>% 
  # Get rid of the the common string "Soil microbial communities from "
  mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>% 
  # Use the first `-` to split the column in two
  separate(`Genome Name`, c("Site","Sample Name"), " - ") %>% 
  # Get rid of the the common string "S-comp-1"
  mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
  # separate the Sample Name into Site ID and plot info
  separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>% 
  # separate the plot info into 3 columns
  separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")
## Rows: 1754 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): Bin ID, Genome Name, Bin Quality, Bin Lineage, GTDB-Tk Taxonomy L...
## dbl  (10): IMG Genome ID, Bin Completeness, Bin Contamination, Total Number ...
## date  (1): Date Added
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: Expected 6 pieces. Additional pieces discarded in 46 rows [3, 4, 24, 25, 26,
## 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 54, 232, 267, ...].
## Warning: Expected 6 pieces. Missing pieces filled with `NA` in 446 rows [1, 2, 9, 10,
## 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 46, 50, 53, ...].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 624 rows [4, 7, 8, 236,
## 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252,
## ...].

4.2 This code was run to help filter the data:

# Filters for Yellowstone
NEON_MAGs_Yellow <- NEON_MAGs %>%
  filter(`Site ID`=="YELL")
# Filters for Gemmatimonadota
NEON_MAGs_Gemmatimonadota <-NEON_MAGs %>%
  filter(Phylum=="Gemmatimonadota")
NEON_chemistry <- read_tsv("data/NEON/neon_plot_soilChem1_metadata.tsv") %>% 
  # remove -COMP from genomicsSampleID
  mutate_at("genomicsSampleID", str_replace, "-COMP", "") 
## Rows: 87 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr   (5): genomicsSampleID, siteID, plotID, nlcdClass, horizon
## dbl  (11): decimalLatitude, decimalLongitude, elevation, soilTemp, d15N, org...
## date  (1): collectionDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
NEON_MAGs_metagenomes <- read_tsv("data/NEON/exported_img_data_Gs0161344_NEON.tsv") %>% 
  rename(`Genome Name` = `Genome Name / Sample Name`) %>% 
  filter(str_detect(`Genome Name`, 're-annotation', negate = T)) %>% 
  filter(str_detect(`Genome Name`, 'WREF plot', negate = T)) 
## Rows: 176 Columns: 46
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (18): Domain, Sequencing Status, Study Name, Genome Name / Sample Name, ...
## dbl (16): taxon_oid, IMG Genome ID, Depth In Meters, Elevation In Meters, Ge...
## lgl (12): Altitude In Meters, Chlorophyll Concentration, Longhurst Code, Lon...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
 NEON_MAGs_metagenomes<- NEON_MAGs_metagenomes %>% 
  # Get rid of the the common string "Soil microbial communities from "
  mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>% 
  # Use the first `-` to split the column in two
  separate(`Genome Name`, c("Site","Sample Name"), " - ") %>% 
  # Get rid of the the common string "-comp-1"
  mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
  # separate the Sample Name into Site ID and plot info
  separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>% 
  # separate the plot info into 3 columns
  separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-") 
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [53].
NEON_Full <- NEON_MAGs%>%
  left_join(NEON_chemistry, by = c("Sample Name" = "genomicsSampleID")) %>%
  left_join(NEON_MAGs_metagenomes, by = "Sample Name")

4.3 This Code Used to Generate Data Files for Sankey Plots

NEON_MAGs_Sankey <- read_csv("data/NEON/GOLD_Study_ID_Gs0161344_NEON_edArchaea.csv") %>% 
  # remove columns that are not needed for data analysis
  select(-c(`GOLD Study ID`, `Bin Methods`, `Created By`, `Date Added`)) %>% 
  # create a new column with the Assembly Type
  mutate("Assembly Type" = case_when(`Genome Name` == "NEON combined assembly" ~ `Genome Name`,
                            TRUE ~ "Individual")) %>% 
  mutate_at("Assembly Type", str_replace, "NEON combined assembly", "Combined") %>% 
  separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus"), "; ", remove = FALSE) %>% 
  # Get rid of the the common string "Soil microbial communities from "
  mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>% 
  # Use the first `-` to split the column in two
  separate(`Genome Name`, c("Site","Sample Name"), " - ") %>% 
  # Get rid of the the common string "S-comp-1"
  mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
  # separate the Sample Name into Site ID and plot info
  separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>% 
  # separate the plot info into 3 columns
  separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-") 
## Rows: 1754 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): Bin ID, Genome Name, Bin Quality, Bin Lineage, GTDB-Tk Taxonomy L...
## dbl  (10): IMG Genome ID, Bin Completeness, Bin Contamination, Total Number ...
## date  (1): Date Added
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: Expected 6 pieces. Additional pieces discarded in 46 rows [3, 4, 24, 25, 26,
## 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 54, 232, 267, ...].
## Warning: Expected 6 pieces. Missing pieces filled with `NA` in 446 rows [1, 2, 9, 10,
## 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 46, 50, 53, ...].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 624 rows [4, 7, 8, 236,
## 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252,
## ...].
NEON_MAGs_ind_Sankey <- NEON_MAGs_Sankey %>% 
  filter(`Assembly Type` == "Individual") 
NEON_MAGs_co_Sankey <- NEON_MAGs_Sankey %>% 
  filter(`Assembly Type` == "Combined") 
# Select the GTDB Taxonomic lineage and separate into taxonomic levels
sankey_data <- NEON_MAGs_co_Sankey %>% 
  select(`GTDB-Tk Taxonomy Lineage`) %>% 
  # NAs are likely Archaea
  replace_na(list(`GTDB-Tk Taxonomy Lineage` = 'Archaea')) %>% 
  # Pavian format requires p__ etc
  separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), "; ") 
## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 615 rows [2, 3, 4, 5, 6,
## 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, ...].
# Fill in the NAs with the taxonomic the higher taxonomic level to the left
sankey_data[] <- t(apply(sankey_data, 1, zoo::na.locf))

# Put the data into a format that can be read by the Sankey App

sankey_data <- sankey_data %>% 
  unite(col = "classification", c(Domain, Phylum, Class, Order, Family, Genus, Species), sep='; ') %>% 
  mutate_at("classification", str_replace, "Archaea", "d__Archaea") %>% 
  mutate_at("classification", str_replace, "Bacteria", "d__Bacteria") %>%  
  mutate_at("classification", str_replace, "; ", "|p__") %>% 
  mutate_at("classification", str_replace, "; ", "|c__") %>% 
  mutate_at("classification", str_replace, "; ", "|o__") %>% 
  mutate_at("classification", str_replace, "; ", "|f__") %>% 
  mutate_at("classification", str_replace, "; ", "|g__") %>% 
  mutate_at("classification", str_replace, "; ", "|s__")  

# Create format for Pavian with counts for each taxonomic level
sankey_data_s <- sankey_data
sankey_data_g <- sankey_data
sankey_data_f <- sankey_data
sankey_data_o <- sankey_data
sankey_data_c <- sankey_data
sankey_data_p <- sankey_data
sankey_data_d <- sankey_data

sankey_data_g$classification <- sub("\\|s__.*", "", sankey_data_g$classification)  
sankey_data_f$classification <- sub("\\|g__.*", "", sankey_data_f$classification)  
sankey_data_o$classification <- sub("\\|f__.*", "", sankey_data_o$classification)  
sankey_data_c$classification <- sub("\\|o__.*", "", sankey_data_c$classification)  
sankey_data_p$classification <- sub("\\|c__.*", "", sankey_data_p$classification)  
sankey_data_d$classification <- sub("\\|p__.*", "", sankey_data_d$classification)  

sankey_data_allTaxa <- bind_rows(sankey_data_s, sankey_data_g, sankey_data_f, sankey_data_o, sankey_data_c, sankey_data_p, sankey_data_d) %>% 
  mutate(classification = as.factor(classification)) %>% 
  count(classification) %>% 
# rename for Pavian format
  rename(`#SampleID` = `classification`) %>% 
  rename(`Metaphlan2_Analysis` = `n`) 

# Write file to input to Pavian Sankey
write_tsv(sankey_data_allTaxa, "data/NEON/NEON_MAG_co_pavian.txt")
NEON_MAGs_ind_Gemmatimonadota <- NEON_MAGs_ind_Sankey %>% 
  filter(Phylum == "Gemmatimonadota") 
sankey_data <- NEON_MAGs_ind_Gemmatimonadota %>% 
  select(`GTDB-Tk Taxonomy Lineage`) %>% 
  # NAs are likely Archaea
  replace_na(list(`GTDB-Tk Taxonomy Lineage` = 'Archaea')) %>% 
  # Pavian format requires p__ etc
  separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), "; ") 
## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 16 rows [1, 2, 3, 4, 5,
## 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16].
# Fill in the NAs with the taxonomic the higher taxonomic level to the left
sankey_data[] <- t(apply(sankey_data, 1, zoo::na.locf))

# Put the data into a format that can be read by the Sankey App

sankey_data <- sankey_data %>% 
  unite(col = "classification", c(Domain, Phylum, Class, Order, Family, Genus, Species), sep='; ') %>% 
  mutate_at("classification", str_replace, "Archaea", "d__Archaea") %>% 
  mutate_at("classification", str_replace, "Bacteria", "d__Bacteria") %>%  
  mutate_at("classification", str_replace, "; ", "|p__") %>% 
  mutate_at("classification", str_replace, "; ", "|c__") %>% 
  mutate_at("classification", str_replace, "; ", "|o__") %>% 
  mutate_at("classification", str_replace, "; ", "|f__") %>% 
  mutate_at("classification", str_replace, "; ", "|g__") %>% 
  mutate_at("classification", str_replace, "; ", "|s__")  

# Create format for Pavian with counts for each taxonomic level
sankey_data_s <- sankey_data
sankey_data_g <- sankey_data
sankey_data_f <- sankey_data
sankey_data_o <- sankey_data
sankey_data_c <- sankey_data
sankey_data_p <- sankey_data
sankey_data_d <- sankey_data

sankey_data_g$classification <- sub("\\|s__.*", "", sankey_data_g$classification)  
sankey_data_f$classification <- sub("\\|g__.*", "", sankey_data_f$classification)  
sankey_data_o$classification <- sub("\\|f__.*", "", sankey_data_o$classification)  
sankey_data_c$classification <- sub("\\|o__.*", "", sankey_data_c$classification)  
sankey_data_p$classification <- sub("\\|c__.*", "", sankey_data_p$classification)  
sankey_data_d$classification <- sub("\\|p__.*", "", sankey_data_d$classification)  

sankey_data_ind_GEM <- bind_rows(sankey_data_s, sankey_data_g, sankey_data_f, sankey_data_o, sankey_data_c, sankey_data_p, sankey_data_d) %>% 
  mutate(classification = as.factor(classification)) %>% 
  count(classification) %>% 
# rename for Pavian format
  rename(`#SampleID` = `classification`) %>% 
  rename(`Metaphlan2_Analysis` = `n`) 

# Write file to input to Pavian Sankey
write_tsv(sankey_data_ind_GEM, "NEON_MAG_ind_pavian.txt")
NEON_MAGs_co_Gemmatimonadota <- NEON_MAGs_co_Sankey %>% 
  filter(Phylum == "Gemmatimonadota") 
sankey_data <- NEON_MAGs_co_Gemmatimonadota %>% 
  select(`GTDB-Tk Taxonomy Lineage`) %>% 
  # NAs are likely Archaea
  replace_na(list(`GTDB-Tk Taxonomy Lineage` = 'Archaea')) %>% 
  # Pavian format requires p__ etc
  separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), "; ") 
## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 6 rows [1, 2, 3, 4, 5,
## 6].
# Fill in the NAs with the taxonomic the higher taxonomic level to the left
sankey_data[] <- t(apply(sankey_data, 1, zoo::na.locf))

# Put the data into a format that can be read by the Sankey App

sankey_data <- sankey_data %>% 
  unite(col = "classification", c(Domain, Phylum, Class, Order, Family, Genus, Species), sep='; ') %>% 
  mutate_at("classification", str_replace, "Archaea", "d__Archaea") %>% 
  mutate_at("classification", str_replace, "Bacteria", "d__Bacteria") %>%  
  mutate_at("classification", str_replace, "; ", "|p__") %>% 
  mutate_at("classification", str_replace, "; ", "|c__") %>% 
  mutate_at("classification", str_replace, "; ", "|o__") %>% 
  mutate_at("classification", str_replace, "; ", "|f__") %>% 
  mutate_at("classification", str_replace, "; ", "|g__") %>% 
  mutate_at("classification", str_replace, "; ", "|s__")  

# Create format for Pavian with counts for each taxonomic level
sankey_data_s <- sankey_data
sankey_data_g <- sankey_data
sankey_data_f <- sankey_data
sankey_data_o <- sankey_data
sankey_data_c <- sankey_data
sankey_data_p <- sankey_data
sankey_data_d <- sankey_data

sankey_data_g$classification <- sub("\\|s__.*", "", sankey_data_g$classification)  
sankey_data_f$classification <- sub("\\|g__.*", "", sankey_data_f$classification)  
sankey_data_o$classification <- sub("\\|f__.*", "", sankey_data_o$classification)  
sankey_data_c$classification <- sub("\\|o__.*", "", sankey_data_c$classification)  
sankey_data_p$classification <- sub("\\|c__.*", "", sankey_data_p$classification)  
sankey_data_d$classification <- sub("\\|p__.*", "", sankey_data_d$classification)  

sankey_data_ind_GEM <- bind_rows(sankey_data_s, sankey_data_g, sankey_data_f, sankey_data_o, sankey_data_c, sankey_data_p, sankey_data_d) %>% 
  mutate(classification = as.factor(classification)) %>% 
  count(classification) %>% 
# rename for Pavian format
  rename(`#SampleID` = `classification`) %>% 
  rename(`Metaphlan2_Analysis` = `n`) 

# Write file to input to Pavian Sankey
write_tsv(sankey_data_ind_GEM, "NEON_MAG_co_gem.txt")
NEON_MAGs_YELL <- NEON_MAGs %>%
  filter(`Site ID`== "YELL")
sankey_data <- NEON_MAGs_YELL %>% 
  select(`GTDB-Tk Taxonomy Lineage`) %>% 
  # NAs are likely Archaea
  replace_na(list(`GTDB-Tk Taxonomy Lineage` = 'Archaea')) %>% 
  # Pavian format requires p__ etc
  separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), "; ") 
## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 50 rows [1, 2, 3, 4, 5,
## 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# Fill in the NAs with the taxonomic the higher taxonomic level to the left
sankey_data[] <- t(apply(sankey_data, 1, zoo::na.locf))

# Put the data into a format that can be read by the Sankey App

sankey_data <- sankey_data %>% 
  unite(col = "classification", c(Domain, Phylum, Class, Order, Family, Genus, Species), sep='; ') %>% 
  mutate_at("classification", str_replace, "Archaea", "d__Archaea") %>% 
  mutate_at("classification", str_replace, "Bacteria", "d__Bacteria") %>%  
  mutate_at("classification", str_replace, "; ", "|p__") %>% 
  mutate_at("classification", str_replace, "; ", "|c__") %>% 
  mutate_at("classification", str_replace, "; ", "|o__") %>% 
  mutate_at("classification", str_replace, "; ", "|f__") %>% 
  mutate_at("classification", str_replace, "; ", "|g__") %>% 
  mutate_at("classification", str_replace, "; ", "|s__")  

# Create format for Pavian with counts for each taxonomic level
sankey_data_s <- sankey_data
sankey_data_g <- sankey_data
sankey_data_f <- sankey_data
sankey_data_o <- sankey_data
sankey_data_c <- sankey_data
sankey_data_p <- sankey_data
sankey_data_d <- sankey_data

sankey_data_g$classification <- sub("\\|s__.*", "", sankey_data_g$classification)  
sankey_data_f$classification <- sub("\\|g__.*", "", sankey_data_f$classification)  
sankey_data_o$classification <- sub("\\|f__.*", "", sankey_data_o$classification)  
sankey_data_c$classification <- sub("\\|o__.*", "", sankey_data_c$classification)  
sankey_data_p$classification <- sub("\\|c__.*", "", sankey_data_p$classification)  
sankey_data_d$classification <- sub("\\|p__.*", "", sankey_data_d$classification)  

sankey_data_ind_GEM <- bind_rows(sankey_data_s, sankey_data_g, sankey_data_f, sankey_data_o, sankey_data_c, sankey_data_p, sankey_data_d) %>% 
  mutate(classification = as.factor(classification)) %>% 
  count(classification) %>% 
# rename for Pavian format
  rename(`#SampleID` = `classification`) %>% 
  rename(`Metaphlan2_Analysis` = `n`) 

# Write file to input to Pavian Sankey
write_tsv(sankey_data_ind_GEM, "NEON_MAG_YELL.txt")

4.4 Preparing Code for Phylogenic Trees

NEON_MAGs_Tree <- read_csv("data/NEON/GOLD_Study_ID_Gs0161344_NEON_2024_4_21.csv") %>% 
  # remove columns that are not needed for data analysis
  select(-c(`GOLD Study ID`, `Bin Methods`, `Created By`, `Date Added`, `Bin Lineage`)) %>% 
  # create a new column with the Assembly Type
  mutate("Assembly Type" = case_when(`Genome Name` == "NEON combined assembly" ~ `Genome Name`,
                            TRUE ~ "Individual")) %>% 
  mutate_at("Assembly Type", str_replace, "NEON combined assembly", "Combined") %>% 
  mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "d__", "") %>%  
  mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "p__", "") %>% 
  mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "c__", "") %>% 
  mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "o__", "") %>% 
  mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "f__", "") %>% 
  mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "g__", "") %>% 
  mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "s__", "") %>%
  separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), ";", remove = FALSE) %>% 
  mutate_at("Domain", na_if,"") %>% 
  mutate_at("Phylum", na_if,"") %>% 
  mutate_at("Class", na_if,"") %>% 
  mutate_at("Order", na_if,"") %>% 
  mutate_at("Family", na_if,"") %>% 
  mutate_at("Genus", na_if,"") %>% 
  mutate_at("Species", na_if,"") %>% 
  
  # Get rid of the the common string "Soil microbial communities from "
  mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>% 
  # Use the first `-` to split the column in two
  separate(`Genome Name`, c("Site","Sample Name"), " - ") %>% 
  # Get rid of the the common string "S-comp-1"
  mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
  # separate the Sample Name into Site ID and plot info
  separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>% 
  # separate the plot info into 3 columns
  separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")
## Rows: 1754 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): Bin ID, Genome Name, Bin Quality, Bin Lineage, GTDB-Tk Taxonomy L...
## dbl  (10): IMG Genome ID, Bin Completeness, Bin Contamination, Total Number ...
## date  (1): Date Added
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 624 rows [1131, 1132,
## 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145,
## 1146, 1147, 1148, 1149, 1150, ...].
NEON_metagenomes <- read_tsv("data/NEON/exported_img_data_Gs0161344_NEON.tsv") %>% 
  select(-c(`Domain`, `Sequencing Status`, `Sequencing Center`)) %>% 
  rename(`Genome Name` = `Genome Name / Sample Name`) %>% 
  filter(str_detect(`Genome Name`, 're-annotation', negate = T)) %>% 
  filter(str_detect(`Genome Name`, 'WREF plot', negate = T)) 
## Rows: 176 Columns: 46
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (18): Domain, Sequencing Status, Study Name, Genome Name / Sample Name, ...
## dbl (16): taxon_oid, IMG Genome ID, Depth In Meters, Elevation In Meters, Ge...
## lgl (12): Altitude In Meters, Chlorophyll Concentration, Longhurst Code, Lon...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
NEON_metagenomes <- NEON_metagenomes %>% 
  # Get rid of the the common string "Soil microbial communities from "
  mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>% 
  # Use the first `-` to split the column in two
  separate(`Genome Name`, c("Site","Sample Name"), " - ") %>% 
  # Get rid of the the common string "-comp-1"
  mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
  # separate the Sample Name into Site ID and plot info
  separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>% 
  # separate the plot info into 3 columns
  separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-") 
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [53].
NEON_chemistry <- read_tsv("data/NEON/neon_plot_soilChem1_metadata.tsv") %>% 
  # remove -COMP from genomicsSampleID
  mutate_at("genomicsSampleID", str_replace, "-COMP", "") 
## Rows: 87 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr   (5): genomicsSampleID, siteID, plotID, nlcdClass, horizon
## dbl  (11): decimalLatitude, decimalLongitude, elevation, soilTemp, d15N, org...
## date  (1): collectionDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
NEON_MAGs_metagenomes_chemistry <- NEON_MAGs %>% 
  left_join(NEON_metagenomes, by = "Sample Name") %>% 
  left_join(NEON_chemistry, by = c("Sample Name" = "genomicsSampleID")) %>% 
  rename("label" = "Bin ID")
tree_arc <- read.tree("data/NEON/gtdbtk.ar53.decorated.tree")
tree_bac <- read.tree("data/NEON/gtdbtk.bac120.decorated.tree")
# Make a vector with the internal node labels
node_vector_bac = c(tree_bac$tip.label,tree_bac$node.label)

# Search for your Phylum or Class to get the node
grep("Gemmatimonadota", node_vector_bac, value = TRUE)
## [1] "'1.0:p__Gemmatimonadota; c__Gemmatimonadetes'"
match(grep("Gemmatimonadota", node_vector_bac, value = TRUE), node_vector_bac)
## [1] 2485
# First need to preorder tree before extracting. N
tree_bac_preorder <- Preorder(tree_bac)
tree_Gemmatimonadota <- Subtree(tree_bac_preorder, 2485)
NEON_MAGs_metagenomes_chemistry_CLBJ <- NEON_MAGs_metagenomes_chemistry %>% 
  filter(`Site ID.x` == "YELL") %>% 
  filter(Domain == "Bacteria")
# Make a vector of the MAGs labels
CLBJ_MAGs_label <- NEON_MAGs_metagenomes_chemistry_CLBJ$label
# Use appropriate Bacteria or Archaea tree to select your site MAG labels
tree_bac_CLBJ_MAGs <-drop.tip(tree_bac,tree_bac$tip.label[-match(CLBJ_MAGs_label, tree_bac$tip.label)])

5 Results

5.1 Yellowstone

kable(
project_groups <- read_tsv("data/NEON/project_groups.tsv") 
)
## Rows: 18 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (9): Domain, Sequencing Status, Study Name, Genome Name / Sample Name, S...
## dbl (6): taxon_oid, IMG Genome ID, Latitude, Longitude, Genome Size  * assem...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
taxon_oid Domain Sequencing Status Study Name Genome Name / Sample Name Sequencing Center IMG Genome ID Phylum Class GOLD Study ID Ecosystem Subtype Latitude Longitude Genome Size * assembled Gene Count * assembled
3300069261 *Microbiome Permanent Draft Terrestrial soil microbial communities from various NEON sites located in USA and Puerto Rico Terrestrial soil microbial communities from Yellowstone NP, Wyoming, USA - YELL_051-M-20210705-comp-1 re-annotation DOE Joint Genome Institute (JGI) 3300069261 Environmental Terrestrial Gs0161344 Temperate forest 44.95428 -110.5416 2764498655 6957571
3300060744 *Microbiome Permanent Draft Terrestrial soil microbial communities from various NEON sites located in USA and Puerto Rico Terrestrial soil microbial communities from Yellowstone NP, Wyoming, USA - YELL_016-M-20210708-comp-1 DOE Joint Genome Institute (JGI) 3300060744 Environmental Terrestrial Gs0161344 Temperate forest 44.96577 -110.5837 1035151817 1787875
3300061626 *Microbiome Permanent Draft Terrestrial soil microbial communities from various NEON sites located in USA and Puerto Rico Terrestrial soil microbial communities from Yellowstone NP, Wyoming, USA - YELL_048-M-20210707-comp-1 DOE Joint Genome Institute (JGI) 3300061626 Environmental Terrestrial Gs0161344 Temperate forest 44.95127 -110.5366 747543128 1371663
3300069273 *Microbiome Permanent Draft Terrestrial soil microbial communities from various NEON sites located in USA and Puerto Rico Terrestrial soil microbial communities from Yellowstone NP, Wyoming, USA - YELL_048-M-20210707-comp-1 re-annotation DOE Joint Genome Institute (JGI) 3300069273 Environmental Terrestrial Gs0161344 Temperate forest 44.95127 -110.5366 1445675622 3657230
3300069238 *Microbiome Permanent Draft Terrestrial soil microbial communities from various NEON sites located in USA and Puerto Rico Terrestrial soil microbial communities from Yellowstone NP, Wyoming, USA - YELL_012-O-20210708-comp-1 re-annotation DOE Joint Genome Institute (JGI) 3300069238 Environmental Terrestrial Gs0161344 Temperate forest 44.94461 -110.4337 3125604789 7412587
3300069283 *Microbiome Permanent Draft Terrestrial soil microbial communities from various NEON sites located in USA and Puerto Rico Terrestrial soil microbial communities from Yellowstone NP, Wyoming, USA - YELL_002-M-20210706-comp-1 re-annotation DOE Joint Genome Institute (JGI) 3300069283 Environmental Terrestrial Gs0161344 Temperate forest 44.93247 -110.6349 1084617294 2762045
3300061637 *Microbiome Permanent Draft Terrestrial soil microbial communities from various NEON sites located in USA and Puerto Rico Terrestrial soil microbial communities from Yellowstone NP, Wyoming, USA - YELL_003-M-20210708-comp-1 DOE Joint Genome Institute (JGI) 3300061637 Environmental Terrestrial Gs0161344 Temperate forest 44.95478 -110.5332 1393141085 2453808
3300069234 *Microbiome Permanent Draft Terrestrial soil microbial communities from various NEON sites located in USA and Puerto Rico Terrestrial soil microbial communities from Yellowstone NP, Wyoming, USA - YELL_016-M-20210708-comp-1 re-annotation DOE Joint Genome Institute (JGI) 3300069234 Environmental Terrestrial Gs0161344 Temperate forest 44.96577 -110.5837 1697741488 3962632
3300060647 *Microbiome Permanent Draft Terrestrial soil microbial communities from various NEON sites located in USA and Puerto Rico Terrestrial soil microbial communities from Yellowstone NP, Wyoming, USA - YELL_009-M-20210706-comp-1 DOE Joint Genome Institute (JGI) 3300060647 Environmental Terrestrial Gs0161344 Temperate forest 44.97031 -110.5019 1136012974 1986999
3300061625 *Microbiome Permanent Draft Terrestrial soil microbial communities from various NEON sites located in USA and Puerto Rico Terrestrial soil microbial communities from Yellowstone NP, Wyoming, USA - YELL_051-M-20210705-comp-1 DOE Joint Genome Institute (JGI) 3300061625 Environmental Terrestrial Gs0161344 Temperate forest 44.95428 -110.5416 1465893308 2677575
3300061175 *Microbiome Permanent Draft Terrestrial soil microbial communities from various NEON sites located in USA and Puerto Rico Terrestrial soil microbial communities from Yellowstone NP, Wyoming, USA - YELL_012-O-20210708-comp-1 DOE Joint Genome Institute (JGI) 3300061175 Environmental Terrestrial Gs0161344 Temperate forest 44.94461 -110.4337 1813525940 3135710
3300060662 *Microbiome Permanent Draft Terrestrial soil microbial communities from various NEON sites located in USA and Puerto Rico Terrestrial soil microbial communities from Yellowstone NP, Wyoming, USA - YELL_005-M-20210708-comp-1 DOE Joint Genome Institute (JGI) 3300060662 Environmental Terrestrial Gs0161344 Temperate forest 44.94838 -110.6314 773391452 1440274
3300069268 *Microbiome Permanent Draft Terrestrial soil microbial communities from various NEON sites located in USA and Puerto Rico Terrestrial soil microbial communities from Yellowstone NP, Wyoming, USA - YELL_046-M-20210705-comp-1 re-annotation DOE Joint Genome Institute (JGI) 3300069268 Environmental Terrestrial Gs0161344 Temperate forest 44.95236 -110.5416 1874275737 4830121
3300069213 *Microbiome Permanent Draft Terrestrial soil microbial communities from various NEON sites located in USA and Puerto Rico Terrestrial soil microbial communities from Yellowstone NP, Wyoming, USA - YELL_003-M-20210708-comp-1 re-annotation DOE Joint Genome Institute (JGI) 3300069213 Environmental Terrestrial Gs0161344 Temperate forest 44.95478 -110.5332 2438396634 5878161
3300069252 *Microbiome Permanent Draft Terrestrial soil microbial communities from various NEON sites located in USA and Puerto Rico Terrestrial soil microbial communities from Yellowstone NP, Wyoming, USA - YELL_005-M-20210708-comp-1 re-annotation DOE Joint Genome Institute (JGI) 3300069252 Environmental Terrestrial Gs0161344 Temperate forest 44.94838 -110.6314 1634258714 4307086
3300060729 *Microbiome Permanent Draft Terrestrial soil microbial communities from various NEON sites located in USA and Puerto Rico Terrestrial soil microbial communities from Yellowstone NP, Wyoming, USA - YELL_046-M-20210705-comp-1 DOE Joint Genome Institute (JGI) 3300060729 Environmental Terrestrial Gs0161344 Temperate forest 44.95236 -110.5416 929469631 1698143
3300069242 *Microbiome Permanent Draft Terrestrial soil microbial communities from various NEON sites located in USA and Puerto Rico Terrestrial soil microbial communities from Yellowstone NP, Wyoming, USA - YELL_009-M-20210706-comp-1 re-annotation DOE Joint Genome Institute (JGI) 3300069242 Environmental Terrestrial Gs0161344 Temperate forest 44.97031 -110.5019 1986905664 4778529
3300060648 *Microbiome Permanent Draft Terrestrial soil microbial communities from various NEON sites located in USA and Puerto Rico Terrestrial soil microbial communities from Yellowstone NP, Wyoming, USA - YELL_002-M-20210706-comp-1 DOE Joint Genome Institute (JGI) 3300060648 Environmental Terrestrial Gs0161344 Temperate forest 44.93247 -110.6349 559493798 1017974

Table 1: This table shows the different phylums present at Yellowstone National Park as well as some information about them including genome size and gene count

datatable(
  NEON_MAGs_Ind_tax %>% 
    select(c(`Genome Name`, `Class`)) %>%
    filter(str_detect(`Genome Name`, 'YELL')) %>%
    count(Class, sort = TRUE)
)

Table 2: This table shows the different Classes found in Yellowstone Nation Park as well as the number of times they were counted at that site represented by the value (n)

NEON_MAGs_Yellow%>%
  ggplot(aes(y=Phylum))+
  geom_bar()+
  labs(title = "Phylum Counts at Yellowstone National Park")

Figure 1: This bar graph shows the different phylums present at Yellowstone NP and how many times they appeared in the collected samples

NEON_MAGs_Yellow %>%   
ggplot(aes(x = `Total Number of Bases`, y = `Phylum`)) +
  geom_point()+
  labs(title = "Phylum Size at Yellowstone National Park")

Figure 2: This graph shows the various counts of each of the phylums located in Yellowstone NP and the number of bases in their genome. Since every count is shown more popular phylums will appear more.

ggplotly(
  ggplot(data = NEON_MAGs_Yellow, aes(x = `Gene Count`, y = `Scaffold Count`)) +
    geom_point(aes(color = Phylum, shape = Phylum))+
    labs(title = "Phylum Scaffold Count vs Gene Count")
 )
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 7 values. Consider specifying shapes manually if you need
##   that many have them.

Figure 3: This interactive graph shows the relationship between the size of the scaffolds and the genes for each of the different phylum present at the site. The lower the scaffold count the higher the quality of the sample.

NEON_MAGs_Yellow %>% 
ggplot(aes(x=`Gene Count`, y= `Bin Completeness`)) +
  geom_point(aes(color = Phylum))+
  labs(title = "Phylum Completeness vs Gene Count")

Figure 4: This graph shows which phylum have missing single-copy gene markers (Bin Completeness) compared to the gene count. The higher the bin completeness the less missing single-copy gene markers.

NEON_MAGs_Yellow %>% 
  filter(is.na(Phylum) | is.na(Class) | is.na(Order) | is.na(Family) | is.na(Genus)) %>%
ggplot(aes(x = `Bin Quality`)) +
  geom_bar(position = position_dodge2(width = 0.9, preserve = "single")) +
  coord_flip()+
  labs(title = "Phylum Quality at Yellowstone National Park")

Figure 5: This graph shows the number of High Quality (HQ) and Medium Quality (MQ) samples taken at Yellowstone NP

ggtree(tree_bac_CLBJ_MAGs, layout="circular")  %<+%
  NEON_MAGs_metagenomes_chemistry +
  geom_point(mapping=aes(color=Phylum))+
  labs(title = "Phylogenetic Tree for Yellowstone National Park")

Figure 6: This figure shows the ancestry of each of the phylum present at Yellowstone National Park and how they evolved apart from each other.

knitr::include_url("data/sankey-NEON_MAG_YELL.txt.html")

Figure 7: Sankey Tree For the phylum present at Yellowstone National Park

5.2 Gemmatimonadota

datatable(
  NEON_MAGs_Gemmatimonadota %>%
    select(c(`Site`,`GTDB-Tk Taxonomy Lineage`, `Gene Count`, `Total Number of Bases`, `Bin Quality`))
  )

Table 3: This table shows the sites where the bacterium Gemmatimonadota is present as well as how many samples were recovered from each site. Other information like gene and base count is also shown.

NEON_MAGs_Gemmatimonadota %>%   
ggplot(aes(x = Site, y = `GTDB-Tk Taxonomy Lineage`)) +
  geom_point() +
  coord_flip()+
  theme(axis.text.x = element_text(angle=60, vjust=1, hjust=1))+
  labs(title = "Gemmatimonadota Lineages at Each Site", y = "Lineages")

Figure 8: This graph shows where each of the different lineages of gemmatimonadota are present at each site

NEON_MAGs_Gemmatimonadota%>%
  ggplot(aes(y=Site))+
  geom_bar()+
  labs(title= "Gemmatimonadota Counts at Each Site")

Figure 9: This graph shows the total counts of gemmatimonadota at each site

NEON_MAGs_Gemmatimonadota %>% 
  filter(is.na(Phylum) | is.na(Class) | is.na(Order) | is.na(Family) | is.na(Genus)) %>%
ggplot(aes(x = `Bin Quality`)) +
  geom_bar(position = position_dodge2(width = 0.9, preserve = "single")) +
  coord_flip()+
  labs(title = "Gemmatimonadota Quality")

Figure 10: This graph shows the number of High Quality (HQ) and Medium Quality (MQ) samples of gemmatimonadota taken at various sites.

NEON_Gemmatimonadota <- NEON_Full%>%
  filter(`Phylum` == "Gemmatimonadota")

NEON_Gemmatimonadota %>%   
ggplot(aes(x = `Total Number of Bases`, y = `Site.x`)) +
  geom_point()+
  labs(title = "Gemmatimonadota Base Size at Each Site", y = "Site")

Figure 11: This graph shows the size of gemmatimonadota at each site

knitr::include_url("data/sankey-NEON_MAG_ind_pavian.txt.html")

Figure 12: Sankey Tree of the individual assemblies of Gemmatimonadota

knitr::include_url("data/sankey-NEON_MAG_co_gem.txt.html")

Figure 13: Sankey Tree of the combined assemblies of Gemmatimonadota

ggtree(tree_Gemmatimonadota)  %<+%
  NEON_MAGs_metagenomes_chemistry + 
  geom_tiplab(size=2, hjust=-.1) +
  xlim(0,20) +
  geom_point(mapping=aes(color=`Ecosystem Subtype`)) +
  labs(title = "Phylogenetic Tree of Gemmatimonadota")

Figure 14: This figure shows the evolutionary tree of gemmatimonadota as well as which ecosystem each lineage was found in

NEON_Gemmatimonadota %>%   
ggplot(aes(x = `soilTemp`, y = `Site.x`)) +
  geom_boxplot()+
  labs(title = "Soil Temperatures", x = "Soil Temperature", y = "Site")
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Figure 15: This graph shows the average soil temperatures in Celsius for each site where gemmatimonadota was found in.

ggplot(data = NEON_Gemmatimonadota, aes(x = `Ecosystem Subtype`, y = `soilTemp`)) +
    geom_point(aes(color=Family))+
  labs(title = "Soil Temperatures Expanded", y = "Soil Temperature")
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).

Figure 16: This graph shows the soil temperature for each sample as well as which ecosystem it was taken from. The family of gemmatimonadota for each sample is also shown.

6 Discussion

6.1 Yellowstone

The results show that a particular phylum of bacteria seems to be present the most at Yellowstone NP (Figure 1). Actinobacteriota is found in most samples significantly more than any other phylum. Actinobacteriota also has a smaller base count compared to the others (Figure 2). Looking at the phylogenetic tree we can see that Actinobacteriota is very diverse in its own phylum (Figure 6). Its also interesting to see that Chloroflexota hasn’t diverged as much from the last common ancestor shared between all the phylum. We wanted to analyze all of the phylum present in more detail and found that many of the samples were of medium quality (Figure 5). If you take a look at the scaffold count many are quiet high, which means they are of lower quality (Figure 3). To see if the larger the size of the bacterium the lower the quality we compared the two. We can see from the results that for some of the phylum the more genes present the higher the scaffold count. Proteobacteria for example follows an almost linear increase. Patescibacteria has the lowest scaffold count of any phylum, but it also has the fewest genes. Actinobacteriota has more samples with a lower scaffold count than the rest of the phylums. In general the more genes a bacterium has the higher the scaffold count and the lower the quality. We wanted to examine this further so we compared the completeness of each of the phylum to the gene count (Figure 4). This will show us if the reason why the quality is low is because as the gene count becomes larger more parts of the expected genome are missing. The results however don’t show that. The samples with the smallest gene count are also the most incomplete. More analysis needs to be done to figure out why that is the case.

6.2 Gemmatimonadota

Taking a closer look at Gemmatimonadota we can see that it lives in a wide diversity of environments. The sites where the samples were found are spread all across the United States (Figure 8 and 9). This shows the bacterium isn’t clustered in one specific region and can thrive nearly anywhere. There are more samples found in Utah compared to the other sites so maybe this could be where it is most successful. The quality of the samples taken across all the sites is almost evenly split with just a bit more being medium quality (Figure 10). Looking at Figure 11 you’ll see a wide diversity in the size of genome across the sites. The Gemmatimonadota at Tucson Arizona have fewer base pairs than than the Gemmatimonadota at LBJ in Texas. Another interesting factor is that Gemmatimonadota doesn’t have a preferred climate or temperature. The sites where it was found could be arid, dry, cold, hot, or wet (Figure 14 and 15). Looking at the phylogenetic tree in Figure 14 we can see that Gemmatimonadota was found most often in shrublands. Something interesting we found was that two of the samples 33000616338_25 and 3300060695_66 are closely related however one lives in the tundra while the other lives in a temperate forest. Looking at the soild temperatures for each of the sites in figure 16 you’ll see that there is a massive difference between the two temperatures. We believe that because these two samples are closely related and they live in very different climates that Gemmatimonadota has evolved to be temperature resistant or highly adaptable. Further studies should be conducted such as placing one sample of Gemmatimonadota that lives in warm soil and placing it in colder soil then see if it survives or dies.

7 Conclusion

Seven unique phylum live in Yellowstone National Park with the most successful one appearing to be Actinobacteriota. This bacterium vary in genome size with the more successful Actinobacteriota having a smaller genome. We noticed a relationship with the larger the genome count the higher the scaffold count, which means the quality of the samples was lower. We investigated further to see if this was due to larger genomes being incomplete, however that ended up being not the case.

Gemmatimonadota is found across various sites in the United States, and can live in a wide variety of ecosystem subtypes. Two closely related strains can even live in two very different climates which seems to suggest that temperature doesn’t effect Gemmatimonadota as much as other bacterium. The diversity within Gemmatimonadota is also on par with other phylum, like the ones found in Yellowstone National Park.

8 References

Yellowstone National Park NEON | NSF NEON | Open Data to Understand our Ecosystems. (n.d.). Retrieved April 7, 2024, from https://www.neonscience.org/field-sites/yell

Mujakić, I., Piwosz, K., & Koblížek, M. (2022). Phylum Gemmatimonadota and Its Role in the Environment. Microorganisms, 10(1), 151. https://doi.org/10.3390/microorganisms10010151